While examining the NYPD crime complaint data, we expect crime rate
to be indicative by the socioeconomic status of a specific neighborhood.
In this section, we calculated the crime rate for each neighborhood each
year, visualized various socioeconomic status by each police precint
(neighborhood), and displayed the top neighborhoods with highest crime
rate, along with what their crime composition looks like. We are also
interested in whether crime rate can be predictive of rent in
corresponding neighborhoods.
Since the primary data used did not have neighborhood information, we
used NYPD precinct data to classify neighborhoods, following Golub et
al. (2006)
# Import and clead NYPD data
#df_nypd = read_csv("https://www.dropbox.com/scl/fi/kf2zk4t1onxzm2vo3lpkq/NYPD_Complaint_Data_Historic.csv?rlkey=ly36vi9v66sno80eir6rohlwn&dl=1", na = "(null)") |> # some values are coded as "(null)" in the df; rewrite them as NA
# janitor::clean_names()
df_nypd = read_csv("data/NYPD_Complaint_Data_Historic.csv") |>
janitor::clean_names()
prec_neighbor = read_csv("data/nyc_prec_neighborhood.csv")
#Merging neighborhood information with NYPD data, keep only relevant variables
nypd_ses_df = df_nypd |>
select(cmplnt_num, cmplnt_fr_dt, addr_pct_cd, crm_atpt_cptd_cd, law_cat_cd, susp_age_group, susp_race, susp_sex, vic_age_group, vic_race, vic_sex, ofns_desc, pd_desc, latitude, longitude) |>
rename(precinct = addr_pct_cd) |>
mutate(cmplnt_fr_dt = as.Date(cmplnt_fr_dt, format = "%m/%d/%Y"),
year = format(cmplnt_fr_dt, "%Y")) |>
left_join(prec_neighbor, by = "precinct")
#Import and clean socio-economic data. neighbor_SES has SES indicators (except rent) for each neighborhood
neighbor_ses = readxl::read_excel("data/neighorhood_indicators.xlsx", sheet = "Data") |>
janitor::clean_names() |>
filter(region_type == "Sub-Borough Area") |>
rename(neighborhood = region_name) |>
select(neighborhood, year, pop_num, hh_inc_med_adj, pop16_unemp_pct, pop_edu_collp_pct, pop_edu_nohs_pct, pop_pov_pct, pop_race_asian_pct, pop_race_black_pct, pop_race_hisp_pct, pop_race_white_pct, pop_foreign_pct) |>
filter(year %in% c(2017, 2018, 2019, 2020, 2021, 2022))
#neighbor_rent has rent data for each neighborhood
neighbor_rent = readxl::read_excel("data/neighorhood_indicators.xlsx", sheet = "Data") |>
janitor::clean_names() |>
filter(region_type == "Sub-Borough Area") |>
rename(neighborhood = region_name) |>
filter(year == "2017-2021") |>
select(neighborhood, gross_rent_0_1beds, gross_rent_2_3beds)
#ses_df has crime rate and SES indicators for each neighborhood
ses_df = nypd_ses_df |>
group_by(year, borough, neighborhood) |>
summarise(crime_num = n())
ses_df = ses_df |> merge(neighbor_ses, by = c("year", "neighborhood")) |>
mutate(crime_rate = (crime_num/pop_num) * 100,000) |>
left_join(neighbor_rent, by = "neighborhood")
In this section, we mapped the crime rate (with a specific focused on
“felony assault”) and the socioeconomic status in each police precinct.
The data used for this analysis were 2021 NYPD crime complaint data and
2020 Census population data.
Crimes categorized under “felony assault” share common characteristics
such as physical injury, intentional and reckless actions, the use of
deadly weapons or dangerous instruments, and others as defined by New
York Penal Law.
The serious crime rate was determined by dividing the total number of
felony assault complaints in each precinct by the total population in
that precinct. Note that due to lack of data, data on crime complaints
and socioeconomic status are based on 2021 records, while data on total
population are from 2020.
# NYC neighborhoods borders
nyc = read_sf(here::here("data", "Police_Precincts.geojson")) |>
select(-shape_area, -shape_leng) |>
mutate(
precinct = as.double(precinct)
)
# prepare data set for mapping
# population data by precinct (2020 census, P1_001N: total population)
df_pop = read_csv(here::here("data", "nyc_precinct_2020pop.csv")) |>
rename(pop = P1_001N) |>
select(precinct, pop)
# select ses variables
ses_heat = neighbor_ses |>
filter(year == 2021) |> # visualize 2021 data
mutate(
pop16_unemp_pct = pop16_unemp_pct * 100,
pop_edu_collp_pct = pop_edu_collp_pct * 100,
pop_pov_pct = pop_pov_pct * 100
) |>
select(neighborhood, pop_num, hh_inc_med_adj, pop16_unemp_pct, pop_edu_collp_pct, pop_pov_pct)
# merge with nypd dataset and calculate relevant indicators
nypd_ses_heat = nypd_ses_df |>
filter(year == 2021) |>
left_join(ses_heat, by = "neighborhood") |> # combine with ses data
st_as_sf(
# which columns to use as coordinates
coords = c("longitude", "latitude"),
# keep the coordinate columns
remove = FALSE,
# projection system
crs = 4326
) |>
merge(df_pop, by = "precinct") |>
group_by(precinct) |>
mutate(
"s_crime_rate" = (sum(ofns_desc == "FELONY ASSAULT") / pop) * 100000, # calculate crime rate per 100,000 using 2020 population census data by precinct
"violation" = (sum(law_cat_cd == "VIOLATION") / n()) * 100, # the rate of violation among all complaints
"misdemeanor" = (sum(law_cat_cd == "MISDEMEANOR") / n()) * 100, # the rate of misdemeanor among all complaints
"felony" = (sum(law_cat_cd == "FELONY") / n()) * 100 # the rate of felony among all complaints
) |>
select(precinct, neighborhood, hh_inc_med_adj, pop16_unemp_pct, pop_edu_collp_pct, pop_pov_pct, geometry, s_crime_rate, felony, misdemeanor, violation)
# spatial join
df_spj = nypd_ses_heat |>
# spatial join
st_join(
# only need these columns from nyc tibble
nyc |> select(precinct, geometry),
# join rows where there is some overlap between a dock and a precinct
join = st_intersects,
left = FALSE
) |>
select(-precinct.y) |>
rename(precinct = precinct.x)
count_by_neighborhood = df_spj |>
# remove geometry for fast counting
st_drop_geometry() |>
distinct() |>
# join the counts into the nyc neighborhood object
right_join(nyc, by=c("precinct" = "precinct")) |>
mutate(
"pre_nei" = paste(precinct, "-", neighborhood, sep = " ")
) |>
st_as_sf() |>
filter(precinct != 22) # remove central park data
This map shows the proportion of different offense levels among all complaints reported in each precinct/neighborhood.
# tmap
tmap_mode("view") # set tmap to the view mode
tm_shape(count_by_neighborhood) +
tm_polygons(col = c("violation", "misdemeanor", "felony"), id = "pre_nei", title = c("Violation (%)", "Misdemeanor (%)", "Felony (%)"), popup.vars = c("Violation: " = "violation", "Misdemeanor: " = "misdemeanor", "Felony: " = "felony")) +
tm_facets(sync = TRUE, ncol = 3)